##################################
#### GOV 2001: Advanced Quantitative Research Methodology
#### Replication Paper
#### Cai et al. (2015) Social Networks and the Decision to Insure
#### Emma Clarke, Isabelle Feldhaus, Jinyi Zhu
##################################

# Load packages
library(ggplot2)
library(knitr)
library(xtable)
library(readstata13)
library(dplyr)
library(AER)
library(plm)
library(multiwayvcov)
library(magrittr)
library(miceadds)
library(car)
library(lmtest)
library(sandwich)
library(psych)
library(stargazer)
library(haven)
library(Amelia)
library(mice)
library(lattice)
library(VIM)
library(mitools)
library(miceadds)

# Load data
data <- read.dta13("0422analysis.dta")

# Section 1: Summary statistics ---------------------------------------------------

# 1. Table 1, mean and std
panelA <- data %>% 
  select(male, age, agpop, educ, ricearea_2010, rice_inc, disaster_yes, disaster_loss, risk_averse, disaster_prob, understanding) %>% 
  summarise_all(funs(n = sum(!is.na(.)), mean = mean(., na.rm = TRUE), sd = sd(., na.rm = TRUE))) %>% 
  matrix(nrow = 11, ncol = 3) %>% 
  data.frame() %>% 
  rename(n = X1, mean = X2, sd = X3) %>% 
  set_rownames(c("male", "age", "agpop", "educ", "ricearea_2010", "rice_inc", "disaster_yes", "disaster_loss", "risk_averse", "disaster_prob", "understanding"))

panelB <- data %>% 
  select(network_obs, network_rate_preintensive, network_twoside, network_second) %>% 
  summarise_all(funs(n = sum(!is.na(.)), mean = mean(., na.rm = TRUE), sd = sd(., na.rm = TRUE))) %>% 
  matrix(nrow = 4, ncol = 3) %>% 
  data.frame() %>% 
  rename(n = X1, mean = X2, sd = X3) %>% 
  set_rownames(c("network_obs", "network_rate_preintensive", "network_twoside", "network_second"))

panelC <- data %>% 
  select(indegree, path_out_ind, eigenvector) %>% 
  summarise_all(funs(n = sum(!is.na(.)), mean = mean(., na.rm = TRUE), sd = sd(., na.rm = TRUE))) %>% 
  matrix(nrow = 3, ncol = 3) %>% 
  data.frame() %>% 
  rename(n = X1, mean = X2, sd = X3) %>% 
  set_rownames(c("indegree", "path_out_ind", "eigenvector"))

takeup_summary <- data %>% 
  select(takeup_survey) %>% 
  summarise_all(funs(n = sum(!is.na(.)), mean = mean(., na.rm = TRUE), sd = sd(., na.rm = TRUE))) %>% 
  matrix(nrow = 1, ncol = 3) %>% 
  data.frame() %>% 
  rename(n = X1, mean = X2, sd = X3) %>% 
  mutate(session = "total")

data$session <- 0
data$session[data$delay == 0 & data$intensive == 0] <- 11
data$session[data$delay == 0 & data$intensive == 1] <- 12
data$session[data$delay == 1 & data$intensive == 0] <- 21
data$session[data$delay == 1 & data$intensive == 1] <- 22
data$session <- as.factor(data$session)
as.data.frame(table(data$session))

takeup_by_session <- data %>% 
  select(takeup_survey, info_none, session) %>% 
  filter(info_none == 1) %>% 
  group_by(session) %>% 
  summarise(n = sum(!is.na(takeup_survey)), mean = mean(takeup_survey, na.rm = TRUE), sd = sd(takeup_survey, na.rm = TRUE)) %>% 
  data.frame()

panelD <- rbind(takeup_by_session, takeup_summary)

# LateX tables by panel
xtable(panelA)
xtable(panelB)
xtable(panelC)
xtable(panelD)

# 2. Table A1, check randomization by sessions
anova(aov(male ~ session, data = data))
anova(aov(age ~ session, data = data))
anova(aov(agpop ~ session, data = data))
anova(aov(educ ~ session, data = data))
anova(aov(ricearea_2010 ~ session, data = data))
anova(aov(rice_inc ~ session, data = data))
anova(aov(disaster_yes ~ session, data = data))
anova(aov(disaster_loss ~ session, data = data))


# Section 2: Intensive Session Effect ---------------------------------------------------
# With extension on bootstrap cluster standard errors
dat <- read.dta13("0422analysis.dta")
#The dataset is now called dat - SORRY!
##################Table 2 in the paper #####################
###Table2, col 1 - intensive session effect
round1dat = dat[dat$delay == 0, ]

lm_t2c1 <- lm(takeup_survey ~ intensive + male + age + agpop + ricearea_2010 + literacy + 
                village, data = round1dat)

#RCSE
vcov_t2c1 <- cluster.vcov(lm_t2c1, round1dat$address)
rcse_t2c1 <- sqrt(diag(vcov_t2c1))
coeftest(lm_t2c1, vcov_t2c1)

#BCSE
bvcov_t2c1 <- cluster.boot(lm_t2c1, round1dat$address, boot_type = "wild")
bcse_t2c1 <- sqrt(diag(bvcov_t2c1))


###check the effects of household covariates
lm_t2c1_nox <- lm(takeup_survey ~ intensive + factor(village), data = round1dat)
vcov_t2c1_nox <- cluster.vcov(lm_t2c1_nox, round1dat$address)
coeftest(lm_t2c1_nox, vcov_t2c1_nox)


# Section 3: Social Network Effect ---------------------------------------------------
###Table 2, col 2
dat_delay_noinfo <- dat[dat$delay == 1 & dat$info_none == 1, ]
lm_t2c2 <- lm(takeup_survey ~ network_rate_preintensive + male + age + agpop + ricearea_2010 + literacy + 
                intensive + risk_averse + disaster_prob + friend1 + friend2 + friend3 + friend4 + friend5 + village,
              data = dat_delay_noinfo)

#RCSE
vcov_t2c2 <- cluster.vcov(lm_t2c2, dat_delay_noinfo$address)
rcse_t2c2 <- sqrt(diag(vcov_t2c2))
coeftest(lm_t2c2, vcov_t2c2)

#BCSE
bvcov_t2c2 <- cluster.boot(lm_t2c2, dat_delay_noinfo$address, boot_type = "wild")
bcse_t2c2 <- sqrt(diag(bvcov_t2c2))


###check the effects of household covariates
lm_t2c2_nox <- lm(takeup_survey ~ network_rate_preintensive 
                  + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village),
                  data = dat[dat$delay == 1 & dat$info_none == 1, ])
vcov_t2c2_nox <- cluster.vcov(lm_t2c2_nox, dat[dat$delay == 1 & dat$info_none == 1, ]$address)
coeftest(lm_t2c2_nox, vcov_t2c2_nox)

###Table 2, col 3
lm_t2c3 <- lm(takeup_survey ~ network_rate_preintensive + network_rate_presimple + intensive 
              + friend1 + friend2 + friend3 + friend4 + friend5 + village,
              data = dat_delay_noinfo)

#RCSE
vcov_t2c3 <- cluster.vcov(lm_t2c3, dat_delay_noinfo$address)
rcse_t2c3 <- sqrt(diag(vcov_t2c3))


#BCSE
bvcov_t2c3 <- cluster.boot(lm_t2c3, dat_delay_noinfo$address, boot_type = "wild")
bcse_t2c3 <- sqrt(diag(bvcov_t2c3))

###Table 2, col 4
lm_t2c4 <- lm(takeup_survey ~ network_rate_preintensive + intensive + network_rate_preintensive:intensive + male + age + agpop + ricearea_2010
              + literacy + risk_averse + disaster_prob + friend1 + friend2 + friend3 + friend4 + friend5 + village
              , data = dat_delay_noinfo)

#RCSE
vcov_t2c4 <- cluster.vcov(lm_t2c4, dat_delay_noinfo$address)
rcse_t2c4 <- sqrt(diag(vcov_t2c4))
coeftest(lm_t2c4, vcov_t2c4)

#BCSE
bvcov_t2c4 <- cluster.boot(lm_t2c4, dat_delay_noinfo$address, boot_type = "wild")
bcse_t2c4 <- sqrt(diag(bvcov_t2c4))

###check the effects of household covariates
lm_t2c4_nox <- lm(takeup_survey ~ network_rate_preintensive + intensive + network_rate_preintensive:intensive 
                  + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village)
                  , data = dat[dat$delay == 1 & dat$info_none == 1, ])
vcov_t2c4_nox <- cluster.vcov(lm_t2c4_nox, dat[dat$delay == 1 & dat$info_none == 1, ]$address)
rcse_t2c4 <- sqrt(diag(vcov_t2c4))
coeftest(lm_t2c4_nox, vcov_t2c4_nox)

###Table 2, col 5
lm_t2c5 <- lm(takeup_survey ~ network_onlyone + network_onlytwo + network_twomore + intensive 
              + network_onlyone:intensive + network_onlytwo:intensive + network_twomore:intensive +
                + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob 
              + friend1 + friend2 + friend3 + friend4 + friend5 + village
              , data = dat_delay_noinfo)

#RCSE
vcov_t2c5 <- cluster.vcov(lm_t2c5, dat_delay_noinfo$address)
rcse_t2c5 <- sqrt(diag(vcov_t2c5))
coeftest(lm_t2c5, vcov_t2c5)

#BCSE
bvcov_t2c5 <- cluster.boot(lm_t2c5, dat_delay_noinfo$address, boot_type = "wild")
bcse_t2c5 <- sqrt(diag(bvcov_t2c5))

###Table 2, col 6
dat$nofriend = 1
dat$nofriend[dat$delay == 1 & dat$info_none == 1 & dat$network_yes == 1] = 0
dat_t2c6 = dat[ (dat$delay == 0 | (dat$delay == 1 & dat$info_none == 1 & dat$nofriend == 1)), ]
dat_t2c6$inter = dat_t2c6$intensive * dat_t2c6$delay
lm_t2c6 <- lm(takeup_survey ~ intensive + delay + intensive*delay 
              + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob 
              + village
              , data = dat_t2c6)
#RCSE
vcov_t2c6 <- cluster.vcov(lm_t2c6, dat_t2c6$address)
rcse_t2c6 <- sqrt(diag(vcov_t2c6))
coeftest(lm_t2c6, vcov_t2c6)

#BCSE
bvcov_t2c6 <- cluster.boot(lm_t2c6, dat_t2c6$address, boot_type = "wild")
bcse_t2c6 <- sqrt(diag(vcov_t2c6))

###Make the Table!!!
stargazer(lm_t2c1, lm_t2c2, lm_t2c3, lm_t2c4, lm_t2c5, lm_t2c6, se=list(rcse_t2c1, rcse_t2c2, rcse_t2c3, rcse_t2c4,rcse_t2c5, rcse_t2c6), 
          omit = c("village", "friend1", "friend2", "friend3", "friend4", "friend5"), style = "aer", no.space = T, df = FALSE)

#BCSE
stargazer(lm_t2c1, lm_t2c2, lm_t2c3, lm_t2c4, lm_t2c5, lm_t2c6, se=list(bcse_t2c1, bcse_t2c2, bcse_t2c3, bcse_t2c4, bcse_t2c5, bcse_t2c6), 
          omit = c("village", "friend1", "friend2", "friend3", "friend4", "friend5"), style = "aer", no.space = T, df = FALSE)


##################Table 3 in the paper #####################
###Table 3, col 1
lm_t3c1 <- lm(takeup_survey ~ network_twoside
              + male + age + agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + friend1 
              + friend2 + friend3 + friend4 + friend5 + factor(village),
              data = dat[dat$delay == 1 & dat$info_none == 1, ])
vcov_t3c1 <- cluster.vcov(lm_t3c1, dat[dat$delay == 1 & dat$info_none == 1, ]$address)
rcse_t3c1 <- sqrt(diag(vcov_t3c1))
coeftest(lm_t3c1, vcov_t3c1)

###Table 3, col 2
lm_t3c2 <- lm(takeup_survey ~ network_second
              + male + age + agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + friend1 
              + friend2 + friend3 + friend4 + friend5 + factor(village),
              data = dat[dat$delay == 1 & dat$info_none == 1, ])
vcov_t3c2 <- cluster.vcov(lm_t3c2, dat[dat$delay == 1 & dat$info_none == 1, ]$address)
rcse_t3c2 <- sqrt(diag(vcov_t3c2))
coeftest(lm_t3c2, vcov_t3c2)

###Table 3, col 3
lm_t3c3 <- lm(takeup_survey ~ network_onlyone + network_onlytwo + network_twomore
              + male + age + agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + friend1 
              + friend2 + friend3 + friend4 + friend5 + factor(village),
              data = dat[dat$delay == 1 & dat$info_none == 1, ])
vcov_t3c3 <- cluster.vcov(lm_t3c3, dat[dat$delay == 1 & dat$info_none == 1, ]$address)
rcse_t3c3 <- sqrt(diag(vcov_t3c3))
coeftest(lm_t3c3, vcov_t3c3)

###Make the Table!!!
stargazer(lm_t3c1, lm_t3c2, lm_t3c3, se=list(rcse_t3c1, rcse_t3c2, rcse_t3c3), 
          omit = c("village", "friend1", "friend2", "friend3", "friend4", "friend5"), style = "aer", no.space = T, df = FALSE)

# Section 5: Social learning of insurance benefits ------------------------

# Generating additional variables
data$inter_net_knowledge <- data$network_rate_preintensive * data$knowledge_network
data$inter_inten_delay <- data$intensive * data$delay
data$inter_inten_netyes <- data$intensive * data$network_yes

# 3. Test learning of insurance benefits
# Table 5
table5.col1 <- data %>% 
  filter(info_none == 1) %>% 
  lm(understanding ~ intensive + delay + inter_inten_delay + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data =.) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$info_none==1]))

table5.col1_model <- data %>% 
  filter(info_none == 1) %>% 
  lm.cluster(., understanding ~ intensive + delay + inter_inten_delay + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, 
             cluster = "address")
linearHypothesis(table5.col1_model, c("intensive = 0", "inter_inten_delay = 0"), test = "F")

table5.col2 <- data %>% 
  filter(info_none == 1 & delay == 1) %>% 
  lm(understanding ~ intensive + network_yes + inter_inten_netyes + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data =.) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$info_none==1 & data$delay==1]))

table5.col3 <- data %>% 
  filter(info_none == 1 & delay == 1) %>% 
  lm(understanding ~ intensive + network_rate_preintensive + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data =.) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$info_none==1 & data$delay==1]))

table5.col4 <- data %>% 
  filter(info_none == 1 & delay == 1) %>% 
  ivreg(understanding ~ knowledge_network + intensive + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village | 
          network_rate_preintensive + intensive + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, 
        data =.) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==1 & data$info_none==1]))

# LateX tables
stargazer(table5.col1, table5.col2, table5.col3, table5.col4, omit = "village", style = "aer", no.space = T, df = FALSE)


# Section 6: Effect of default ------------------------
######### ######### Analysis file: section 6 ######### ######### 

######### Table A5: Characteristics of insurance takers by default option

# This table is just checking the randomization of the default option.

dat.A5 <- dat[dat$takeup_survey==1 & dat$delay==0,]
table.A5 <- data.frame(t(data.frame(dat.A5 %>% group_by(default) %>% summarize(
  "Gender of household head" = mean(male,na.rm=T), 
  "Age" = mean(age,na.rm=T), 
  "Household size"=mean(agpop,na.rm=T),
  "Area of rice production" = mean(ricearea_2010,na.rm=T),
  "Education"=mean(literacy,na.rm=T),
  "Trust in government"=mean(general_trust,na.rm=T),
  "Post-session insurance knowledge score"=mean(understanding,na.rm=T),
  "Perceived probability of future disasters (percent)"=mean(disaster_prob,na.rm=T),
  "Revealed purchase decision to friends"=mean(reveal,na.rm=T)))))

colnames(table.A5) <- c("NotBuy", "Buy")
table.A5$difference <- table.A5$NotBuy - table.A5$Buy 
View(table.A5)

######### Table 6, column 1: Default effect

# This is the first stage regression in the IV. It tells us that the "default" rate is a valid instrument 
# for studying the relationship between take-up in the first round and take-up in the later rounds.

## Run the regression model
firststage.table6.col1 <- lm(takeup_survey ~ default + male + age + agpop + ricearea_2010 + literacy + 
                               risk_averse + disaster_prob + village, data=dat[dat$delay==0,])

## Estimate clustered standard errors
vcov.table6.col1 <- cluster.vcov(firststage.table6.col1, dat$address[dat$delay==0])
rcse.t6c1 <- sqrt(diag(vcov.table6.col1))
coeftest(firststage.table6.col1, vcov.table6.col1)


# Section 7: Effect of overall take-up ------------------------
######### ######### Analysis file: section 7 ######### ######### 

## Now, we're estimating the relationship between first-round take-up and second-round take-up.

## COLUMN 2: We start with just a regular OLS regression.

## First, generate a variable for the mean first-round takeup rate in each village:
pre_takeup_rate <- data.frame( dat[dat$delay==0,] %>% group_by(address) %>% summarize(pre_takeup_rate = mean(takeup_survey)) )
dat <- merge(dat, pre_takeup_rate, by="address")
View(data.frame(dat$address, dat$delay, dat$takeup_survey, dat$pre_takeup_rate))

## Next, generate interactions between takeup rates and information (whether 2nd rount partcipants were given information 
# about takeup in the first round). 
dat$inter_pre_none <- dat$pre_takeup_rate * dat$info_none
dat$inter_pre_none_iv <- dat$default * dat$info_none

## Next, run the regression:
table6.col2 <- lm(takeup_survey ~ pre_takeup_rate + info_none + inter_pre_none + male + age + agpop + ricearea_2010 +
                    literacy + intensive + risk_averse + disaster_prob + village, dat = dat[dat$delay==1,])
# Estimate clustered standard errors
vcov.table6.col2 <- cluster.vcov(table6.col2, dat$address[dat$delay==1])
rcse.t6c2 <- sqrt(diag(vcov.table6.col2))
coeftest(table6.col2, vcov.table6.col2)
# How many observations?
nrow(dat[dat$delay==1,])

## COLUMN 3: Next, we run the IV, using the randomly assigned default AND the interaction of default with 'no info' as instruments
table6.col3 <- ivreg(formula = takeup_survey ~ pre_takeup_rate + inter_pre_none + info_none + male + age +
                       agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                     ~ default + inter_pre_none_iv + info_none + male + age +
                       agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                     data = dat[dat$delay==1,])
vcov.table6.col3 <- cluster.vcov(table6.col3, dat$address[dat$delay==1])
rcse.t6c3 <- sqrt(diag(vcov.table6.col3))
coeftest(table6.col3, vcov.table6.col3) 

## COLUMN 5: Next, we run the IV, using the randomly assigned default as the instrument
table6.col5 <- ivreg(formula = takeup_survey ~ pre_takeup_rate + male + age +
                       agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                     ~ default + male + age +
                       agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                     data = dat[dat$delay==1 & dat$info_none==0,])
vcov.table6.col5 <- cluster.vcov(table6.col5, dat$address[dat$delay==1 & dat$info_none==0])
rcse.t6c5 <- sqrt(diag(vcov.table6.col5))
coeftest(table6.col5, vcov.table6.col5) 

## COLUMN 7: Next, we run the IV, using the randomly assigned default as the instrument
table6.col7 <- ivreg(formula = takeup_survey ~ pre_takeup_rate + male + age +
                       agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                     ~ default + male + age +
                       agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                     data = dat[dat$delay==1 & dat$info_none==1,])
vcov.table6.col7 <- cluster.vcov(table6.col7, dat$address[dat$delay==1 & dat$info_none==1])
rcse.t6c7 <- sqrt(diag(vcov.table6.col7))
coeftest(table6.col7, vcov.table6.col7) 

## COLUMN 4: Next, running the reduced form regresssions
table6.col4 <- lm(takeup_survey ~ default + info_none + inter_pre_none_iv + male + age + agpop + ricearea_2010 +
                    literacy + intensive + risk_averse + disaster_prob + village, data=dat[dat$delay==1,])
vcov.table6.col4 <- cluster.vcov(table6.col4, dat$address[dat$delay==1])
rcse.t6c4 <- sqrt(diag(vcov.table6.col4))
coeftest(table6.col4, vcov.table6.col4) 

## COLUMN 6: 
table6.col6 <- lm(takeup_survey ~ default + male + age + agpop + ricearea_2010 +
                    literacy + intensive + risk_averse + disaster_prob + village, data=dat[dat$delay==1 & dat$info_none==0,])
vcov.table6.col6 <- cluster.vcov(table6.col6, dat$address[dat$delay==1 & dat$info_none==0])
rcse.t6c6 <- sqrt(diag(vcov.table6.col6))
coeftest(table6.col6, vcov.table6.col6) 

## COLUMN 8:
table6.col8 <- lm(takeup_survey ~ default + male + age + agpop + ricearea_2010 +
                    literacy + intensive + risk_averse + disaster_prob + village, data=dat[dat$delay==1 & dat$info_none==1,])
vcov.table6.col8 <- cluster.vcov(table6.col8, dat$address[dat$delay==1 & dat$info_none==1])
rcse.t6c8 <- sqrt(diag(vcov.table6.col8))
coeftest(table6.col8, vcov.table6.col8) 

## LATEX TABLE
stargazer(firststage.table6.col1, table6.col2, table6.col3, table6.col4, table6.col5, table6.col6, table6.col7, table6.col8, 
          se=list(rcse.t6c1, rcse.t6c2, rcse.t6c3, rcse.t6c4, rcse.t6c5, rcse.t6c6, rcse.t6c7, rcse.t6c8), 
          omit = c("village", "friend1", "friend2", "friend3", "friend4", "friend5"), 
          style = "aer", no.space = T, df = FALSE)


# Section 8: Effect of friends' take-up -----------------------------------
# Generating additional variables
pre_takeup_rate <- data.frame( data[data$delay==0,] %>% group_by(address) %>% summarize(pre_takeup_rate = mean(takeup_survey)) )
data <- merge(data, pre_takeup_rate, by = "address")
data$inter_pre_none <- data$pre_takeup_rate * data$info_none
data$inter_pre_none_iv <- data$default * data$info_none
data$inter_pre_takeup <- data$network_rate_presession * data$pre_takeup_rate
data$inter_pre_takeup_iv <- data$network_rate_presession * data$default
data$inter_prenetwork_none <- data$network_rate_pretakeup * data$info_none
data$inter_prenetwork_none_iv1 <- data$inter_pre_takeup_iv * data$info_none

# Table 7, OLS and IV estimation, columns 1,2,3,5,7 (paper: columns 1,2,4,6)

table7.notincluded <- data %>% # Not included in paper, but in Table7.xls in data file.
  filter(delay == 1 & info_takeup_rate != 1) %>% 
  lm.cluster(., network_rate_pretakeup ~ inter_pre_takeup_iv + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
             cluster = "address") %>% 
  summary()

table7.col1 <- data %>% 
  filter(delay == 1 & info_takeup_rate != 1) %>% 
  lm.cluster(., takeup_survey ~ pre_takeup_rate + network_rate_pretakeup + info_none + inter_pre_none + inter_prenetwork_none + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
             cluster = "address") %>% 
  summary()

table7.col1_model <- data %>% 
  filter(delay == 1 & info_takeup_rate != 1) %>% 
  lm.cluster(., takeup_survey ~ pre_takeup_rate + network_rate_pretakeup + info_none + inter_pre_none + inter_prenetwork_none + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
             cluster = "address")

linearHypothesis(table7.col1_model, c("pre_takeup_rate = 0", "inter_pre_none = 0"), test = "F")
linearHypothesis(table7.col1_model, c("network_rate_pretakeup = 0", "inter_prenetwork_none = 0"), test = "F")

table7.col2 <- data %>% 
  filter(delay == 1 & info_takeup_rate != 1) %>% 
  ivreg(takeup_survey ~ network_rate_pretakeup + pre_takeup_rate + inter_pre_none + inter_prenetwork_none + info_none + male + age + agpop + ricearea_2010 + literacy + intensive + village | 
          inter_pre_takeup_iv + network_rate_presession + default + inter_pre_none_iv + inter_prenetwork_none_iv1 + info_none + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
        data =.) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==1 & data$info_takeup_rate!=1]))

table7.col2_iv <- data %>% 
  filter(delay == 1 & info_takeup_rate != 1) %>% 
  ivreg(takeup_survey ~ network_rate_pretakeup + pre_takeup_rate + inter_pre_none + inter_prenetwork_none + info_none + male + age + agpop + ricearea_2010 + literacy + intensive + village | 
          inter_pre_takeup_iv + network_rate_presession + default + inter_pre_none_iv + inter_prenetwork_none_iv1 + info_none + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
        data =.)

linearHypothesis(table7.col2_iv, c("pre_takeup_rate = 0", "inter_pre_none = 0"), test = "F")
linearHypothesis(table7.col2_iv, c("network_rate_pretakeup = 0", "inter_prenetwork_none = 0"), test = "F")

table7.col4 <- data %>% 
  filter(delay == 1 & info_takeup_list == 1) %>% 
  ivreg(takeup_survey ~ network_rate_pretakeup + pre_takeup_rate + male + age + agpop + ricearea_2010 + literacy + intensive + village | 
          inter_pre_takeup_iv + default + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
        data =.) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==1 & data$info_takeup_list==1])) 
# Robust SEs are slightly different here compared to the paper. 
# Differences between `ivreg` and `ivregress 2sls` in Stata could potentially point to cause.
# All coefficients match.

table7.col6 <- data %>% 
  filter(delay == 1 & info_none == 1) %>% 
  ivreg(takeup_survey ~ network_rate_pretakeup + pre_takeup_rate + male + age + agpop + ricearea_2010 + literacy + intensive + village | 
          inter_pre_takeup_iv + default + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
        data =.) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==1 & data$info_none==1])) 
# Robust SEs are slightly different here compared to the paper. 
# Differences between `ivreg` and `ivregress 2sls` in Stata could potentially point to cause.
# All coefficients match.

# Table 7, reduced form esimation, columns 4,6,8 (paper: columns 3,5,7)

table7.col3 <- data %>% 
  filter(delay == 1 & info_takeup_rate != 1) %>% 
  lm.cluster(., takeup_survey ~ inter_pre_takeup_iv + default + inter_pre_none_iv + inter_prenetwork_none_iv1 + info_none + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
             cluster = "address") %>% 
  summary()

table7.col5 <- data %>% 
  filter(delay == 1 & info_takeup_list == 1) %>% 
  lm.cluster(., takeup_survey ~ inter_pre_takeup_iv + default + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
             cluster = "address") %>% 
  summary()

table7.col7 <- data %>% 
  filter(delay == 1 & info_none == 1) %>% 
  lm.cluster(., takeup_survey ~ inter_pre_takeup_iv + default + male + age + agpop + ricearea_2010 + literacy + intensive + village, 
             cluster = "address") %>% 
  summary()

# Extensions ---------------------------------------------------------------

# Placebo test for IV assumptions (i.e. testing whether IV has strong first stage on other variables)
understanding.default <- data %>% 
  filter(delay == 0) %>% 
  lm(understanding ~ default + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data = .) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==0]))

disaster_loss.default <- data %>% 
  filter(delay == 0) %>% 
  lm(disaster_loss ~ default + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data = .) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==0]))

insurance_repay.default <- data %>% 
  filter(delay == 0) %>% 
  lm(insurance_repay ~ default + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data = .) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==0]))

insurance_buy.default <- data %>% 
  filter(delay == 0) %>% 
  lm(insurance_buy ~ default + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data = .) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==0]))

general_trust.default <- data %>% 
  filter(delay == 0) %>% 
  lm(general_trust ~ default + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data = .) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==0]))

disaster_yes.default <- data %>% 
  filter(delay == 0) %>% 
  lm(disaster_yes ~ default + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data = .) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==0]))

knowledge_network.default <- data %>% 
  filter(delay == 0) %>% 
  lm(knowledge_network ~ default + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob + village, data = .) %>% 
  coeftest(., vcov = cluster.vcov(., cluster = data$address[data$delay==0]))

# None of the above tests show that `default` is a significant predictor of selected outcomes

######### ######### Extension: missing data analysis ######### ######### 

# Calculate the portion missingness of each variable
pMiss <- function(x){sum(is.na(x))/length(x)}
missing.prop <- data.frame( apply(dat,2,pMiss) )
xtable(missing.prop)

# Some of the variables (ID, address, region, village, takeup survey, buy insurance...) are complete

# It makes sense that "reveal" is missing for 61.7% of people because this question
# was only answered by those who were in the 1st Round group.

# The network variables are all missing among 6-7% of respondents. Maybe the questions weren't asked of them?
# Network knowledge is missing for 24% of participants.
# The paper explains that this variable is set as missing if a farmer has no friends who were in first-round sessions.

# "disaster_yes" is missing among 9% of participants, and "disaster_loss' is missing among 48% of participants. 
# However, neither of these variables is used in any of the models (just presented for balance)

# Missingness in other variables is relatively low ("understanding" has 3.6% missingness; others <2%).

# Visualize the missingness

# For the visual, I'm not going to include the complete variables (not enough space).
# I'm also not going to include the disaster variables, since they don't show up in the models.
# And I'm not going to include "reveal" (since it isn't really "missing").
# And I'm only going to include one of the network variables since they all have the same missingness pattern.
vars.missingviz <- c("age", "agpop", "educ", "rice_inc", "ricearea_2010", "male", "understanding",
                     "literacy", "network_obs")

aggr_plot <- aggr(dat[,vars.missingviz], col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE,
                  labels=vars.missingviz, cex.axis=0.7, cex.numbers=0.8, gap=1, prop=c(TRUE,FALSE),
                  ylab=c("Histogram of missing data","Pattern"))

# 4370 of the observations have no missingness.
# There doesn't seem to be any clear pattern to the missingness. (It is not monotone).

# I'm going to impute the missing data and see if our results change in any meaningful way.
# I'll run the Amelia imputation algorithm (Expectation Maximization, assuming multivariate normality).
# I'm focusing on the main results in Table 2.

dat$inter_inten_delay <- dat$intensive * dat$delay

# First re-running the first column of Table 5, using imputed data

dat$nofriend = 1
dat$nofriend[dat$delay == 1 & dat$info_none == 1 & dat$network_yes == 1] = 0

vars.subforimp <- c("intensive", "delay", "inter_inten_delay", "male", "age", "agpop", 
                    "ricearea_2010","literacy", "risk_averse", "disaster_prob","understanding", 
                    "takeup_survey", "info_none", "village", "address", "network_rate_preintensive",
                    "friend1", "friend2", "friend3", "friend4", "friend5", "network_rate_presimple",
                    "network_onlyone", "network_onlytwo", "network_twomore", "nofriend")
dat.subforimp <- dat[,vars.subforimp]
a.out <- amelia(dat.subforimp, m=5, idvars=c("village", "address")) # creating 5 imputed datasets

##################Table 2 in the paper: repeating with imputed data #####################
###Table2, col 1 - intensive session effect
tab2col1.out <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ intensive + male + age + agpop + ricearea_2010 + literacy + 
       village, data = x[x$delay == 0,])
})
betas.tab2col1 <- MIextract(tab2col1.out, fun=coef) # Extract betas
vars.tab2col1 <- MIextract(tab2col1.out, fun=vcov) # Extract variances
tab2col1 <- summary(MIcombine(betas.tab2col1,vars.tab2col1)) # Combine using Rubin's rules

###check the effects of household covariates
tab2col1.out.nox <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ intensive + factor(village), data = x[x$delay == 0,])
})
betas.tab2col1.nox <- MIextract(tab2col1.out.nox, fun=coef) # Extract betas
vars.tab2col1.nox <- MIextract(tab2col1.out.nox, fun=vcov) # Extract variances
tab2col1.nox <- summary(MIcombine(betas.tab2col1.nox,vars.tab2col1.nox)) # Combine using Rubin's rules


# Section 3: Social Network Effect ---------------------------------------------------
###Table 2, col 2

tab2col2.out <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ network_rate_preintensive + male + age + agpop + ricearea_2010 + literacy + 
       intensive + risk_averse + disaster_prob + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village),
     data = x[x$delay == 1 & x$info_none == 1, ])
})
betas.tab2col2 <- MIextract(tab2col2.out, fun=coef) # Extract betas
vars.tab2col2 <- MIextract(tab2col2.out, fun=vcov) # Extract variances
tab2col2 <- summary(MIcombine(betas.tab2col2,vars.tab2col2)) # Combine using Rubin's rules

###check the effects of household covariates
tab2col2.out.nox <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ network_rate_preintensive 
     + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village),
     data = x[x$delay == 1 & x$info_none == 1, ])
})
betas.tab2col2.nox <- MIextract(tab2col2.out.nox, fun=coef) # Extract betas
vars.tab2col2.nox <- MIextract(tab2col2.out.nox, fun=vcov) # Extract variances
tab2col2.nox <- summary(MIcombine(betas.tab2col2.nox,vars.tab2col2.nox)) # Combine using Rubin's rules

###Table 2, col 3
tab2col3.out <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ network_rate_preintensive + network_rate_presimple + intensive + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village),
     data = x[x$delay == 1 & x$info_none == 1, ])
})
betas.tab2col3 <- MIextract(tab2col3.out, fun=coef) # Extract betas
vars.tab2col3 <- MIextract(tab2col3.out, fun=vcov) # Extract variances
tab2col3 <- summary(MIcombine(betas.tab2col3,vars.tab2col3)) # Combine using Rubin's rules

###Table 2, col 4
tab2col4.out <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ network_rate_preintensive + intensive + network_rate_preintensive:intensive + male + age + agpop + ricearea_2010
     + literacy + risk_averse + disaster_prob + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village)
     , data = x[x$delay == 1 & x$info_none == 1, ])
})
betas.tab2col4 <- MIextract(tab2col4.out, fun=coef) # Extract betas
vars.tab2col4 <- MIextract(tab2col4.out, fun=vcov) # Extract variances
tab2col4 <- summary(MIcombine(betas.tab2col4,vars.tab2col4)) # Combine using Rubin's rules


###check the effects of household covariates
tab2col4.out.nox <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ network_rate_preintensive + intensive + network_rate_preintensive:intensive 
     + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village)
     , data = x[x$delay == 1 & x$info_none == 1, ])
})
betas.tab2col4.nox <- MIextract(tab2col4.out.nox, fun=coef) # Extract betas
vars.tab2col4.nox <- MIextract(tab2col4.out.nox, fun=vcov) # Extract variances
tab2col4.nox <- summary(MIcombine(betas.tab2col4.nox,vars.tab2col4.nox)) # Combine using Rubin's rules

###Table 2, col 5
tab2col5.out <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ network_onlyone + network_onlytwo + network_twomore + intensive 
     + network_onlyone:intensive + network_onlytwo:intensive + network_twomore:intensive +
       + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob 
     + friend1 + friend2 + friend3 + friend4 + friend5 + factor(village)
     , data = x[x$delay == 1 & x$info_none == 1, ])
})
betas.tab2col5 <- MIextract(tab2col5.out, fun=coef) # Extract betas
vars.tab2col5 <- MIextract(tab2col5.out, fun=vcov) # Extract variances
tab2col5 <- summary(MIcombine(betas.tab2col5,vars.tab2col5)) # Combine using Rubin's rules

###Table 2, col 6
tab2col6.out <- lapply(a.out$imputations, function(x){
  lm(takeup_survey ~ intensive + delay + intensive*delay 
     + male + age + agpop + ricearea_2010 + literacy + risk_averse + disaster_prob 
     + factor(village)
     , data = x[ (x$delay == 0 | (x$delay == 1 & x$info_none == 1 & x$nofriend == 1)), ])
})
betas.tab2col6 <- MIextract(tab2col6.out, fun=coef) # Extract betas
vars.tab2col6 <- MIextract(tab2col6.out, fun=vcov) # Extract variances
tab2col6 <- summary(MIcombine(betas.tab2col6,vars.tab2col6)) # Combine using Rubin's rules

###Print results!!!
tab2col1
tab2col2
tab2col3
tab2col4
tab2col5
tab2col6

unique(dat$address)

######### ######### Extension: heterogeneous treamtent effects ######### ############ 

# The instrument is "default"
# The treatment is "pre_takeup_rate"
# The outcome is "takeup_survey" 
# This analysis is done among those with dat$delay==1 & dat$info_none==0 (since the authors only observed an effect among this group)

######### Creating binary treatment variable

# In this analysis, we redefine the treatment as "majority takeup in the first-round." 
# The IV analysis is now asking: what is the impact of living in a village with > 50% 
# insurance uptake in the first round on insurance uptake in the second round?
# (Rather than: what is the impact of the takeup rate in the first round on the takeup rate in the second round?)

# Creating a binary treatment variable:

dat$pre_takeup_maj <- rep(NA)
dat$pre_takeup_maj[dat$pre_takeup_rate >= 0.5] <- 1
dat$pre_takeup_maj[dat$pre_takeup_rate < 0.5] <- 0
table(dat$pre_takeup_maj)

# If we run the IV analysis with this binary treatment variable:
iv.binary <- ivreg(formula = takeup_survey ~ pre_takeup_maj + male + age +
                     agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                   ~ default + male + age +
                     agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                   data = dat[dat$delay==1 & dat$info_none==0,])
vcov.iv.binary <- cluster.vcov(iv.binary, dat$address[dat$delay==1 & dat$info_none==0])
coeftest(iv.binary, vcov.iv.binary) # estimated treatment effect is 0.62040558
summary(iv.binary)$df[1] + summary(iv.binary)$df[2] # number of observations in the model is 1378

######### Complier characteristics

# 2 WAYS OF LOOKING AT COMPLIERS IN THIS PAPER:
## INDIVIDUAL LEVEL approach: compliers defined as *individuals* induced to take up insurance by the default
## ADDRESS LEVEL approach: compliers defined as *addresses* (a.k.a. natural villages) induced to have a 
## higher takeup rate by the default. While the paper presents the "individual level" first-stage, 
## the IV models (2SLS) are actually run using the "address level" first-stage. We can compare these: 

# Individual version:
firststage.table6.col1 <- lm(takeup_survey ~ default + male + age + agpop + ricearea_2010 + literacy +  # individual effect
                               risk_averse + disaster_prob + village, data=dat[dat$delay==0,])
summary(firststage.table6.col1)$coef[2,] # 0.12

# Group treatment with continuous treatment variable, individual-level data:
firststage.group.cont <- lm(pre_takeup_rate ~ default + male + age + agpop + ricearea_2010 + literacy + 
                              risk_averse + disaster_prob + village, data=dat[dat$delay==0,])
summary(firststage.group.cont)$coef[2,] # 0.12

# Group treatment with binary treatment variable, individual-level data:
firststage.group.bin <- lm(pre_takeup_maj ~ default + male + age + agpop + ricearea_2010 + literacy + 
                             risk_averse + disaster_prob + village, data=dat[dat$delay==0,])
summary(firststage.group.bin)$coef[2,] # 0.20 


# What can we learn about the compliers? 

## The portion of the population that are compliers is equal to the difference between the portion 
## with instrument *on* who are treated and the portion with instrument *off* who are treated. 

## "INDIVIDUAL" level approach

mean(dat$takeup_survey[dat$delay==0 ]) # overall takeup rate in first round = 0.43
mean(dat$takeup_survey[dat$delay==0 & dat$default==0]) # 0.353 take-up among those with instrument off
mean(dat$takeup_survey[dat$delay==0 & dat$default==1]) # 0.507 take-up among those with instrument on
mean(dat$takeup_survey[dat$delay==0 & dat$default==1]) - mean(dat$takeup_survey[dat$delay==0 & dat$default==0]) # 0.15% of population = compliers

## "ADDRESS" LEVEL approach (aka "natural village" level): 

# Creating a collapsed dataset:
dat.collapsed <- data.frame( dat %>% group_by(address, info_none, delay) %>% summarize (default = mean(default),
                                                                                        pre_takeup_maj = mean(pre_takeup_maj),
                                                                                        pre_takeup_rate = mean(pre_takeup_rate),
                                                                                        portion.male = mean(male, na.rm=T),
                                                                                        meanagpop = mean(agpop, na.rm=T),
                                                                                        meanage = mean(age, na.rm=T),
                                                                                        meaneduc=mean(educ, na.rm=T),
                                                                                        meandisaster_prob=mean(disaster_prob, na.rm=T),
                                                                                        meanricearea_2010=mean(ricearea_2010, na.rm=T),
                                                                                        meanliteracy=mean(literacy, na.rm=T),
                                                                                        meanrisk_averse=mean(risk_averse, na.rm=T)))
location.vars <- c("address", "village", "region")
justlocation <- dat[,location.vars]
dat.collapsed <- merge(dat.collapsed, justlocation, by="address")

dat.collapsed$count <- rep(1)

dat.collapsed %>% group_by(default) %>% summarize(count=sum(count)) # there is one "address" that includes people with different values of "default" (this shouldn't be the case)
View(dat[dat$address=="xinlian1314",]) # one person in this "address" is coded as if they received the default. not clear why. problem for analysis, so re-coding this person.
dat$default[dat$address=="xinlian1314"] <- rep(0)

# Re-make the collapsed dataset
dat$count <- rep(1)
dat.collapsed <- data.frame( dat %>% group_by(address, delay, info_none) %>% summarize (default = mean(default),
                                                                                        pre_takeup_maj = mean(pre_takeup_maj),
                                                                                        pre_takeup_rate = mean(pre_takeup_rate),
                                                                                        portion.male = mean(male, na.rm=T),
                                                                                        meanagpop = mean(agpop, na.rm=T),
                                                                                        meanage = mean(age, na.rm=T),
                                                                                        meaneduc=mean(educ, na.rm=T),
                                                                                        meandisaster_prob=mean(disaster_prob, na.rm=T),
                                                                                        meanricearea_2010=mean(ricearea_2010, na.rm=T),
                                                                                        meanliteracy=mean(literacy, na.rm=T),
                                                                                        meanrisk_averse=mean(risk_averse, na.rm=T),
                                                                                        count=sum(count)))
location.vars <- c("address", "village", "region")
justlocation <- dat[,location.vars]
justlocation <- unique(justlocation)
dat.collapsed <- merge(dat.collapsed, justlocation, by="address")

# Look at takeup rates:

mean(dat.collapsed$pre_takeup_rate[dat.collapsed$delay==1 & dat.collapsed$info_none==0]) # mean takeup rate 0.4475436 across "addresses" in the info group
mean(dat.collapsed$pre_takeup_rate[dat.collapsed$delay==1 & dat.collapsed$info_none==0 & dat.collapsed$default==0]) # mean address-level takeup in default=0 villages: 0.3644713
mean(dat.collapsed$pre_takeup_rate[dat.collapsed$delay==1 & dat.collapsed$info_none==0 & dat.collapsed$default==1]) # mean address-level takeup in default=1 villages: 0.5347183

mean(dat.collapsed$pre_takeup_maj[dat.collapsed$delay==1 & dat.collapsed$info_none==0]) # portion addresses w/ maj takeup 0.4216867
mean(dat.collapsed$pre_takeup_maj[dat.collapsed$delay==1 & dat.collapsed$info_none==0 & dat.collapsed$default==0]) # portion default=0 addresses w/ maj takeup  0.2941176
mean(dat.collapsed$pre_takeup_maj[dat.collapsed$delay==1 & dat.collapsed$info_none==0 & dat.collapsed$default==1]) # portion default=1 addresses w/ maj takeup 0.5555556
mean(dat.collapsed$pre_takeup_maj[dat.collapsed$delay==1 & dat.collapsed$info_none==0 & dat.collapsed$default==1]) - 
  mean(dat.collapsed$pre_takeup_maj[dat.collapsed$delay==1 & dat.collapsed$info_none==0 & dat.collapsed$default==0]) # portion compliers by this metric = 0.2614379

## Characteristics of the complier population: can compare the ratio of the first-stage among a particular sub-group of villages
## to the first-stage in all of the villages overall.

firststage.all <- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                       meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0,])
summary(firststage.all)$coef[2,] # 0.17793975 (close to the individual-level coefficient) 

vcov_all <- cluster.vcov(firststage.all, dat.collapsed[dat.collapsed$delay == 0,]$address)
rcse_all <- sqrt(diag(vcov_all))


# villages with above average "proportion male"

mean.portionmale <- mean(dat.collapsed$portion.male, na.rm=T) # 0.9122471

firststage.80plusmale <- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                              meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$portion.male > mean.portionmale,])
summary(firststage.80plusmale)$coef[2,] # 0.22271101 (still similar to individual-level coefficient)

vcov_moremale <- cluster.vcov(firststage.80plusmale, dat.collapsed$address[dat.collapsed$delay==0 & dat.collapsed$portion.male > mean.portionmale])
rcse_moremale <- sqrt(diag(vcov_moremale))

# villages with above average "mean disaster probability"

meandisasterprob <- mean(dat.collapsed$meandisaster_prob) # 33.82403
meandisasterprob

firststage.highdisasterprob<- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                                   meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$meandisaster_prob > meandisasterprob,])
summary(firststage.highdisasterprob)$coef[2,] # 0.01894394 (low)

firststage.lowdisasterprob<- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                                  meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$meandisaster_prob < meandisasterprob,])
summary(firststage.lowdisasterprob)$coef[2,] # 0.38550596 (high: villages with low disaster probability more influenced by the default; this makes sense)

vcov_disast.high <- cluster.vcov(firststage.highdisasterprob, dat.collapsed$address[dat.collapsed$delay==0 & dat.collapsed$meandisaster_prob > meandisasterprob])
rcse_disast.high <- sqrt(diag(vcov_disast.high))

vcov_disast.low <- cluster.vcov(firststage.lowdisasterprob, dat.collapsed$address[dat.collapsed$delay==0 & dat.collapsed$meandisaster_prob < meandisasterprob])
rcse_disast.low <- sqrt(diag(vcov_disast.low))

# villages with above average risk aversion

meanriskaversion <- mean(dat.collapsed$meanrisk_averse) # 0.1889684
meanriskaversion

firststage.highriskaversion <- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                                    meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$meanrisk_averse > meanriskaversion,])
summary(firststage.highriskaversion)$coef[2,] # 0.1521795

firststage.lowriskaversion <- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                                   meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$meanrisk_averse <= meanriskaversion,])
summary(firststage.lowriskaversion)$coef[2,] # 0.1037084 


vcov_riskav.high <- cluster.vcov(firststage.highriskaversion, dat.collapsed$address[dat.collapsed$delay==0 & dat.collapsed$meanrisk_averse > meanriskaversion])
rcse_riskav.high <- sqrt(diag(vcov_disast.high))

vcov_riskav.low <- cluster.vcov(firststage.lowriskaversion, dat.collapsed$address[dat.collapsed$delay==0 & dat.collapsed$meanrisk_averse <= meanriskaversion])
rcse_risav.low <- sqrt(diag(vcov_disast.low))

# over and under the mean age

meanage <- mean(dat.collapsed$meanage, na.rm=T) # 51.61675
meanage

firststage.ageover51 <- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                             meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$meanage > meanage,])
summary(firststage.ageover51)$coef[2,] # 0.388269179 (high)

firststage.ageunder51 <- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                              meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$meanage < meanage,])
summary(firststage.ageunder51)$coef[2,] # -0.0906974 (negative!!! this is a monotonicity violation)

firststage.ageunder40 <- lm(pre_takeup_maj ~ default + portion.male + meanage + meanagpop + meanricearea_2010 + meanliteracy + 
                              meanrisk_averse + meandisaster_prob + village, data=dat.collapsed[dat.collapsed$delay==0 & dat.collapsed$meanage < 40,])
summary(firststage.ageunder51)$coef[2,] # -0.0906974 (also negative)

vcov_age.high <- cluster.vcov(firststage.ageover51, dat.collapsed$address[dat.collapsed$delay==0 & dat.collapsed$meanage > meanage])
rcse_age.high <- sqrt(diag(vcov_disast.high))

vcov_age.low <- cluster.vcov(firststage.ageunder51, dat.collapsed$address[dat.collapsed$delay==0 & dat.collapsed$meanage < meanage])
rcse_age.low <- sqrt(diag(vcov_disast.low))

## Summarizing all of this in a table:

stargazer(firststage.all, firststage.highdisasterprob, firststage.lowdisasterprob, 
          firststage.highriskaversion, firststage.lowriskaversion, firststage.ageover51, firststage.ageunder51,
          se=list(rcse_all, rcse_disast.high, rcse_disast.low,rcse_riskav.high, rcse_risav.low,rcse_age.high, rcse_age.low), 
          style = "aer", no.space = T, df = FALSE)

######### Now, examining heterogeneous treatment effects

## Share of study population in different groups:

# share of the individuals (second round, no info) in each group:
share.alwaystakers <- sum(dat$delay==1 & dat$info_none==1 & dat$default==0 & dat$pre_takeup_maj==1 )/ 
  sum(dat$delay==1 & dat$info_none==1) 
share.alwaystakers # 15%
share.nevertakers <-  sum(dat$delay==1 & dat$info_none==1 & dat$default==1 & dat$pre_takeup_maj==0 )/ 
  sum(dat$delay==1 & dat$info_none==1)
share.nevertakers # 24%
share.compliers <- 1 - share.alwaystakers - share.nevertakers # 0.603 (remember from above that the portion of villages induced to have majority takeup was 0.2614)

# share of the villages in each group
dat$count <- rep(1)
subset.vars <- c("address", "default", "pre_takeup_maj", "count", "info_none")
dat.subset <- dat[dat$info_none==0 & dat$delay==1,subset.vars]
dat.subset <- data.frame( dat.subset %>% group_by(address) %>% summarize(default=mean(default), pre_takeup_maj=mean(pre_takeup_maj), n=sum(count)) )
share.alwaystakers.vill <- sum(dat.subset$default == 0 & dat.subset$pre_takeup_maj==1) / sum(dat.subset$default == 0)
share.alwaystakers.vill # 0.2941176
share.nevertakers.vill <- sum(dat.subset$default == 1 & dat.subset$pre_takeup_maj==0) / sum(dat.subset$default == 1) 
share.nevertakers.vill # 0.4444444 (most villages are "never takers" in this set-up)
share.compliers.vill <- 1 - share.alwaystakers.vill - share.nevertakers.vill
share.compliers.vill # 0.2614379
table(dat.subset$n) # variation in how many people there are in each village in the "no info" group

share.compliers.vill2 <- mean(dat.collapsed$pre_takeup_maj[dat.collapsed$info_none==0 & dat.collapsed$default==1]) - mean(dat.collapsed$pre_takeup_maj[dat.collapsed$info_none==0 & dat.collapsed$default==0]) # proportion of those with instrument off who were treated
share.compliers.vill2  # portion compliers by this metric = 0.2614379 (same as above)

# set up data for calculations

iv.binary <- ivreg(formula = takeup_survey ~ pre_takeup_maj + male + age +
                     agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                   ~ default + male + age +
                     agpop + ricearea_2010 + literacy + intensive + risk_averse + disaster_prob + village,
                   data = dat[dat$delay==1 & dat$info_none==0,])
vcov.iv.binary <- cluster.vcov(iv.binary, dat$address[dat$delay==1 & dat$info_none==0])
coeftest(iv.binary, vcov.iv.binary) # estimated treatment effect is 0.62040558
summary(iv.binary)$df[1] + summary(iv.binary)$df[2] # number of observations in the model is 1378

iv.vars <- rownames(coeftest(iv.binary, vcov.iv.binary))
dat$`(Intercept)` <- rep(1)
dat$villagebeidang <- ifelse(dat$village=="beidang", 1, 0)
dat$villagebeilian <- ifelse(dat$village=="beilian", 1, 0)
dat$villagebeixing <- ifelse(dat$village=="beixing", 1, 0)
dat$villagecaijia <- ifelse(dat$village=="caijia", 1, 0)
dat$villagechaxi <- ifelse(dat$village=="chaxi", 1, 0)
dat$villagedaqiao <- ifelse(dat$village=="daqiao", 1, 0)
dat$villagedaxi <- ifelse(dat$village=="daxi", 1, 0)
dat$villagedayu <- ifelse(dat$village=="dayu", 1, 0)
dat$villagedazhou <- ifelse(dat$village=="dazhou", 1, 0)
dat$villagedongan <- ifelse(dat$village=="dongan", 1, 0)
dat$villagedukou <- ifelse(dat$village=="dukou", 1, 0)
dat$villagefusheng <- ifelse(dat$village=="fusheng", 1, 0)
dat$villagefuzhou <- ifelse(dat$village=="fuzhou", 1, 0)
dat$villagegangtou <- ifelse(dat$village=="gangtou", 1, 0)
dat$villagegangxia <- ifelse(dat$village=="gangxia", 1, 0)
dat$villageguojia <- ifelse(dat$village=="guojia", 1, 0)
dat$villagehefeng <- ifelse(dat$village=="hefeng", 1, 0)
dat$villagehelin <- ifelse(dat$village=="helin", 1, 0)
dat$villagehongxing <- ifelse(dat$village=="hongxing", 1, 0)
dat$villagehuangshan <- ifelse(dat$village=="huangshan", 1, 0)
dat$villagejingang <- ifelse(dat$village=="jingang", 1, 0)
dat$villagejinggang <- ifelse(dat$village=="jinggang", 1, 0)
dat$villagelianhe <- ifelse(dat$village=="lianhe", 1, 0)
dat$villagelianqian <- ifelse(dat$village=="lianqian", 1, 0)
dat$villagelianxing <- ifelse(dat$village=="lianxing", 1, 0)
dat$villagelongqing <- ifelse(dat$village=="longqing", 1, 0)
dat$villagelusikou <- ifelse(dat$village=="lusikou", 1, 0)
dat$villagemazhou <- ifelse(dat$village=="mazhou", 1, 0)
dat$villageminzhu <- ifelse(dat$village=="minzhu", 1, 0)
dat$villageshigang <- ifelse(dat$village=="shigang", 1, 0)
dat$villagewanshang <- ifelse(dat$village=="wanshang", 1, 0)
dat$villagexiabao <- ifelse(dat$village=="xiabao", 1, 0)
dat$villagexianghu <- ifelse(dat$village=="xianghu", 1, 0)
dat$villagexiaofang <- ifelse(dat$village=="xiaofang", 1, 0)
dat$villagexiashatou <- ifelse(dat$village=="xiashatou", 1, 0)
dat$villagexiecheng <- ifelse(dat$village=="xiecheng", 1, 0)
dat$villagexihe <- ifelse(dat$village=="xihe", 1, 0)
dat$villagexihu <- ifelse(dat$village=="xihu", 1, 0)
dat$villagexilian <- ifelse(dat$village=="xilian", 1, 0)
dat$villagexinguang <- ifelse(dat$village=="xinguang", 1, 0)
dat$villagexingzeng <- ifelse(dat$village=="xingzeng", 1, 0)
dat$villagexinlian <- ifelse(dat$village=="xinlian", 1, 0)
dat$villageyanjiang <- ifelse(dat$village=="yanjiang", 1, 0)
dat$villageyazhou <- ifelse(dat$village=="yazhou", 1, 0)
dat$villageyongfeng <- ifelse(dat$village=="yongfeng", 1, 0)
dat$villagezhangxi <- ifelse(dat$village=="zhangxi", 1, 0)
dat$villagezixi <- ifelse(dat$village=="zixi", 1, 0)
iv.dat <- dat[,iv.vars]
iv.coefs <- coeftest(iv.binary, vcov.iv.binary)[,1]
iv.dat.means <- apply(iv.dat, 2, mean, na.rm=T)

# "average untreated outcome for compliers"
iv.dat.means["pre_takeup_maj"] <- 0
compliers.untreated <- iv.dat.means %*% iv.coefs # 0.231948 
compliers.untreated

# "average treated outcome for compliers"
iv.dat.means["pre_takeup_maj"] <- 1
compliers.treated <- iv.dat.means %*% iv.coefs 
compliers.treated # 0.8523536 

# "average untreated outcome for never-takers" 
nevertakers.untreated <- mean(dat$takeup_survey[dat$delay==1 & dat$default==1 & dat$pre_takeup_maj==0 & dat$info_none==0]) # 0.41 

# "average treated outcome for always-takers" 
alwaystakers.treated <- mean(dat$takeup_survey[dat$delay==1 & dat$default==0 & dat$pre_takeup_maj==1 & dat$info_none==0]) # 0.56 